package org.codelibs.elasticsearch.minhash.index.mapper;
import static org.elasticsearch.common.xcontent.support.XContentMapValues.isArray;
import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeStringValue;
import static org.elasticsearch.index.mapper.TypeParsers.parseField;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;
import org.codelibs.minhash.MinHash;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.CollectionUtils;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.plain.BytesBinaryDVIndexFieldData;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.QueryShardException;
import com.carrotsearch.hppc.ObjectArrayList;
public class MinHashFieldMapper extends FieldMapper {
/** Type name returned by {@link #contentType()} and by the field type's {@code typeName()}. */
public static final String CONTENT_TYPE = "minhash";
/** Analyzer handed to {@code MinHash.calculate} for every indexed value; replaced on merge. */
private NamedAnalyzer minhashAnalyzer;
/** Optional list of fields that receive the binary-string form of the hash; may be {@code null}. */
private CopyBitsTo copyBitsTo;
/** Holds the default field type that {@link Builder} passes to its superclass. */
public static class Defaults {
    /** Frozen default: {@link IndexOptions#NONE} index options and stored. */
    public static final MappedFieldType FIELD_TYPE = newDefaultFieldType();

    /** Builds and freezes the default field type. */
    private static MappedFieldType newDefaultFieldType() {
        final MappedFieldType type = new MinHashFieldType();
        type.setIndexOptions(IndexOptions.NONE);
        type.setStored(true);
        type.freeze();
        return type;
    }
}
/**
 * Builder for {@link MinHashFieldMapper}. On top of the settings handled by
 * {@link FieldMapper.Builder} it carries the MinHash analyzer and the
 * optional copy-bits targets.
 */
public static class Builder
        extends FieldMapper.Builder<Builder, MinHashFieldMapper> {

    private NamedAnalyzer minhashAnalyzer;

    private CopyBitsTo copyBitsTo;

    /**
     * @param name name of the field being mapped
     */
    public Builder(String name) {
        super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
        builder = this;
    }

    /**
     * Finalises the field type and creates the mapper.
     *
     * @param context builder context supplying the index settings
     * @return the configured mapper
     */
    @Override
    public MinHashFieldMapper build(BuilderContext context) {
        setupFieldType(context);
        final Settings indexSettings = context.indexSettings();
        final MultiFields multiFields = multiFieldsBuilder.build(this, context);
        return new MinHashFieldMapper(name, fieldType, defaultFieldType,
                indexSettings, multiFields, copyTo, minhashAnalyzer,
                copyBitsTo);
    }

    /**
     * Sets the analyzer used to compute the hash.
     *
     * @return this builder
     */
    public Builder minhashAnalyzer(final NamedAnalyzer minhashAnalyzer) {
        this.minhashAnalyzer = minhashAnalyzer;
        return builder;
    }

    /**
     * Sets the fields that receive the hash bits.
     *
     * @return this builder
     */
    public Builder copyBitsTo(final CopyBitsTo copyBitsTo) {
        this.copyBitsTo = copyBitsTo;
        return builder;
    }
}
/**
 * Parses the mapping definition of a {@code minhash} field into a
 * {@link MinHashFieldMapper.Builder}.
 */
public static class TypeParser implements Mapper.TypeParser {
    /**
     * Handles the common field options via {@code parseField} and then
     * consumes the {@code minhash_analyzer} and {@code copy_bits_to}
     * properties, removing each handled entry from {@code node}.
     *
     * @param name          field name
     * @param node          mapping properties; handled entries are removed
     * @param parserContext context used to resolve the analyzer by name
     * @return the configured builder
     * @throws MapperParsingException if {@code minhash_analyzer} names an
     *                                analyzer that the index does not define
     */
    @Override
    public Mapper.Builder parse(String name, Map<String, Object> node,
            ParserContext parserContext) throws MapperParsingException {
        MinHashFieldMapper.Builder builder = new MinHashFieldMapper.Builder(
                name);
        parseField(builder, name, node, parserContext);
        for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet()
                .iterator(); iterator.hasNext();) {
            Map.Entry<String, Object> entry = iterator.next();
            String propName = entry.getKey();
            Object propNode = entry.getValue();
            if (propNode == null) {
                // Null-valued entries are left in the map untouched.
                continue;
            }
            if (propName.equals("minhash_analyzer")) {
                final String analyzerName = propNode.toString();
                final NamedAnalyzer analyzer = parserContext
                        .getIndexAnalyzers().get(analyzerName);
                if (analyzer == null) {
                    // Fail here with a clear message instead of storing null
                    // and hitting a NullPointerException when the mapper is
                    // later serialised or used for indexing.
                    throw new MapperParsingException("Analyzer ["
                            + analyzerName + "] not found for field ["
                            + name + "]");
                }
                builder.minhashAnalyzer(analyzer);
                iterator.remove();
            } else if (propName.equals("copy_bits_to")) {
                parseCopyBitsFields(propNode, builder);
                iterator.remove();
            }
        }
        return builder;
    }
}
/**
 * Converts a mapping property value into a {@link CopyBitsTo} and stores
 * it on the builder. The value may be a single node or a list of nodes;
 * each node is turned into a field name with {@code nodeStringValue}.
 *
 * @param propNode single node or list of nodes naming the target fields
 * @param builder  builder that receives the resulting {@link CopyBitsTo}
 */
public static void parseCopyBitsFields(Object propNode, Builder builder) {
    // Treat a single value as a one-element list so one loop covers both shapes.
    final List<?> targets = isArray(propNode)
            ? (List<?>) propNode
            : Collections.singletonList(propNode);
    final CopyBitsTo.Builder copyToBuilder = new CopyBitsTo.Builder();
    for (final Object target : targets) {
        copyToBuilder.add(nodeStringValue(target, null));
    }
    builder.copyBitsTo(copyToBuilder.build());
}
/**
 * Field type of {@code minhash} fields. Values are displayed as raw bytes,
 * field data is backed by binary doc values, and term queries are rejected.
 */
static final class MinHashFieldType extends MappedFieldType {

    public MinHashFieldType() {
    }

    /** Copy constructor used by {@link #clone()}. */
    protected MinHashFieldType(MinHashFieldType ref) {
        super(ref);
    }

    @Override
    public MappedFieldType clone() {
        return new MinHashFieldType(this);
    }

    @Override
    public String typeName() {
        return CONTENT_TYPE;
    }

    /**
     * Converts a stored value to a {@link BytesReference}.
     *
     * @param value a {@link BytesRef}, {@link BytesReference}, {@code byte[]},
     *              or any other object whose {@code toString()} is Base64 text
     * @return the bytes, or {@code null} when {@code value} is {@code null}
     */
    @Override
    public BytesReference valueForDisplay(Object value) {
        if (value == null) {
            return null;
        }
        if (value instanceof BytesRef) {
            return new BytesArray((BytesRef) value);
        }
        if (value instanceof BytesReference) {
            return (BytesReference) value;
        }
        if (value instanceof byte[]) {
            return new BytesArray((byte[]) value);
        }
        // Anything else is decoded from the Base64 text of its toString().
        final byte[] decoded = Base64.getDecoder().decode(value.toString());
        return new BytesArray(decoded);
    }

    /** Returns a binary doc-values field data builder after checking that doc values are available. */
    @Override
    public IndexFieldData.Builder fielddataBuilder() {
        failIfNoDocValues();
        return new BytesBinaryDVIndexFieldData.Builder();
    }

    /**
     * Always fails: this field type cannot be searched.
     *
     * @throws QueryShardException on every call
     */
    @Override
    public Query termQuery(Object value, QueryShardContext context) {
        throw new QueryShardException(context,
                "MinHash fields do not support searching");
    }
}
/**
 * Creates the mapper. The first six arguments are forwarded unchanged to
 * {@link FieldMapper}.
 *
 * @param minhashAnalyzer analyzer used to compute the hash of each value
 * @param copyBitsTo      fields that receive the hash bits, or {@code null}
 */
protected MinHashFieldMapper(String simpleName, MappedFieldType fieldType,
        MappedFieldType defaultFieldType, Settings indexSettings,
        MultiFields multiFields, CopyTo copyTo,
        NamedAnalyzer minhashAnalyzer, CopyBitsTo copyBitsTo) {
    super(simpleName, fieldType, defaultFieldType, indexSettings,
            multiFields, copyTo);
    this.copyBitsTo = copyBitsTo;
    this.minhashAnalyzer = minhashAnalyzer;
}
/**
 * Computes the MinHash of the incoming text value and adds it to the
 * document as a stored field and/or a binary doc-values entry, depending
 * on the field type. When copy-bits targets are configured, the
 * binary-string form of the hash is passed on to them.
 *
 * @param context parse context supplying the value and the target document
 * @param fields  list that receives the stored field, if any
 * @throws IOException            propagated from the parser or from the copy targets
 * @throws MapperParsingException if a value is present but no MinHash
 *                                analyzer is configured for this field
 */
@Override
protected void parseCreateField(ParseContext context,
        List<IndexableField> fields) throws IOException {
    String value;
    if (context.externalValueSet()) {
        value = context.externalValue().toString();
    } else {
        XContentParser parser = context.parser();
        if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
            value = fieldType().nullValueAsString();
        } else {
            value = parser.textOrNull();
        }
    }
    if (value == null) {
        return;
    }
    if (minhashAnalyzer == null) {
        // Without this check the null analyzer is passed straight to
        // MinHash.calculate; report the misconfiguration by field name.
        throw new MapperParsingException("Field [" + fieldType().name()
                + "] has no minhash_analyzer configured");
    }
    byte[] minhashValue = MinHash.calculate(minhashAnalyzer, value);
    if (fieldType().stored()) {
        fields.add(
                new Field(fieldType().name(), minhashValue, fieldType()));
    }
    if (fieldType().hasDocValues()) {
        // All values of one document share a single doc-values field,
        // looked up by field name.
        CustomMinHashDocValuesField field = (CustomMinHashDocValuesField) context
                .doc().getByKey(fieldType().name());
        if (field == null) {
            field = new CustomMinHashDocValuesField(fieldType().name(),
                    minhashValue);
            context.doc().addWithKey(fieldType().name(), field);
        } else {
            field.add(minhashValue);
        }
    }
    if (copyBitsTo != null) {
        parseCopyBitsFields(
                context.createExternalValueContext(
                        MinHash.toBinaryString(minhashValue)),
                copyBitsTo.copyBitsToFields);
    }
}
/**
 * Hands the current value to every field listed in {@code copyToFields}.
 * Does nothing when the context is already inside a copy-to operation or
 * when the list is empty.
 *
 * @param context      context carrying the value to copy
 * @param copyToFields full names of the target fields
 * @throws IOException propagated from the target mappers
 */
private static void parseCopyBitsFields(ParseContext context,
        List<String> copyToFields) throws IOException {
    if (context.isWithinCopyTo() || copyToFields.isEmpty()) {
        return;
    }
    final ParseContext copyContext = context.createCopyToContext();
    for (final String field : copyToFields) {
        // With nested documents, walk up from the current document to the
        // first one whose prefix matches the target field name.
        ParseContext.Document targetDoc = copyContext.doc();
        while (targetDoc != null
                && !field.startsWith(targetDoc.getPrefix())) {
            targetDoc = targetDoc.getParent();
        }
        assert targetDoc != null;
        final ParseContext copyToContext = targetDoc == copyContext.doc()
                ? copyContext
                : copyContext.switchDoc(targetDoc);
        parseCopy(field, copyToContext);
    }
}
/**
 * Lets the mapper registered under {@code field} parse the given context.
 * Does nothing when no mapper exists for that name.
 *
 * @param field   full name of the target field
 * @param context context handed to the target mapper
 * @throws IOException propagated from the target mapper
 */
private static void parseCopy(String field, ParseContext context)
        throws IOException {
    final FieldMapper target = context.docMapper().mappers()
            .getMapper(field);
    if (target == null) {
        return;
    }
    target.parse(context);
}
/**
 * @return {@link #CONTENT_TYPE}
 */
@Override
protected String contentType() {
    return CONTENT_TYPE;
}
/**
 * Merges the common settings through the superclass, then takes over the
 * MinHash analyzer and copy-bits targets of {@code mergeWith}.
 *
 * @param mergeWith      mapper to merge from; cast to {@link MinHashFieldMapper}
 * @param updateAllTypes forwarded to the superclass
 */
@Override
protected void doMerge(Mapper mergeWith, boolean updateAllTypes) {
    super.doMerge(mergeWith, updateAllTypes);
    final MinHashFieldMapper other = (MinHashFieldMapper) mergeWith;
    this.minhashAnalyzer = other.minhashAnalyzer;
    this.copyBitsTo = other.copyBitsTo;
}
/**
 * Writes the mapper-specific settings after the common ones: the
 * {@code minhash_analyzer} name and, when configured, the
 * {@code copy_bits_to} targets.
 *
 * @param builder         output builder
 * @param includeDefaults forwarded to the superclass
 * @param params          serialisation parameters
 * @throws IOException propagated from the builder
 */
@Override
protected void doXContentBody(XContentBuilder builder,
        boolean includeDefaults, Params params) throws IOException {
    super.doXContentBody(builder, includeDefaults, params);
    // The analyzer is null when the mapping never set one; omit the entry
    // rather than fail with a NullPointerException while serialising.
    if (minhashAnalyzer != null) {
        builder.field("minhash_analyzer", minhashAnalyzer.name());
    }
    if (copyBitsTo != null) {
        copyBitsTo.toXContent(builder, params);
    }
}
/**
 * Binary doc-values field that gathers several MinHash values for one
 * document and serialises them as a single {@link BytesRef}.
 */
public static class CustomMinHashDocValuesField implements IndexableField {

    /** Frozen field type with {@link DocValuesType#BINARY} doc values. */
    public static final FieldType TYPE = new FieldType();
    static {
        TYPE.setDocValuesType(DocValuesType.BINARY);
        TYPE.freeze();
    }

    private final ObjectArrayList<byte[]> bytesList;

    /** Sum of the lengths of every value passed to {@link #add(byte[])}. */
    private int totalSize = 0;

    private final String name;

    /**
     * @param name  field name
     * @param bytes first value to hold
     */
    public CustomMinHashDocValuesField(String name, byte[] bytes) {
        this.name = name;
        bytesList = new ObjectArrayList<>();
        add(bytes);
    }

    /** Appends one more value to this field. */
    public void add(byte[] bytes) {
        bytesList.add(bytes);
        totalSize += bytes.length;
    }

    /**
     * Sorts and de-duplicates the collected values, then encodes them as a
     * VInt count followed by a (VInt length, bytes) pair per value.
     *
     * @return the encoded values
     * @throws ElasticsearchException if writing to the buffer fails
     */
    @Override
    public BytesRef binaryValue() {
        CollectionUtils.sortAndDedup(bytesList);
        final int count = bytesList.size();
        // Reserve 5 bytes for the count and for each length prefix, plus the payload.
        final byte[] buffer = new byte[totalSize + (count + 1) * 5];
        final ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
        try {
            out.writeVInt(count);
            for (int idx = 0; idx < count; idx++) {
                final byte[] element = bytesList.get(idx);
                out.writeVInt(element.length);
                out.writeBytes(element, 0, element.length);
            }
        } catch (IOException e) {
            throw new ElasticsearchException("Failed to get MinHash value",
                    e);
        }
        return new BytesRef(buffer, 0, out.getPosition());
    }

    @Override
    public String name() {
        return name;
    }

    @Override
    public IndexableFieldType fieldType() {
        return TYPE;
    }

    @Override
    public float boost() {
        return 1f;
    }

    /** @return always {@code null} */
    @Override
    public String stringValue() {
        return null;
    }

    /** @return always {@code null} */
    @Override
    public Reader readerValue() {
        return null;
    }

    /** @return always {@code null} */
    @Override
    public Number numericValue() {
        return null;
    }

    /** @return always {@code null} */
    @Override
    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
        return null;
    }
}
/**
 * Immutable list of field names that receive the binary-string form of a
 * MinHash value. Instances are created through {@link Builder}.
 */
public static class CopyBitsTo {

    private final List<String> copyBitsToFields;

    private CopyBitsTo(List<String> copyBitsToFields) {
        this.copyBitsToFields = copyBitsToFields;
    }

    /**
     * Writes the targets as a {@code copy_bits_to} array. Nothing is
     * written when there are no targets.
     *
     * @param builder output builder
     * @param params  serialisation parameters (not used here)
     * @return {@code builder}
     * @throws IOException propagated from the builder
     */
    public XContentBuilder toXContent(XContentBuilder builder,
            Params params) throws IOException {
        if (!copyBitsToFields.isEmpty()) {
            builder.startArray("copy_bits_to");
            for (String field : copyBitsToFields) {
                builder.value(field);
            }
            builder.endArray();
        }
        return builder;
    }

    /** Collects target field names and creates a {@link CopyBitsTo}. */
    public static class Builder {

        private final List<String> copyBitsToBuilders = new ArrayList<>();

        /**
         * Adds one target field.
         *
         * @return this builder
         */
        public Builder add(String field) {
            copyBitsToBuilders.add(field);
            return this;
        }

        /**
         * @return a {@link CopyBitsTo} holding a snapshot of the fields
         *         added so far
         */
        public CopyBitsTo build() {
            // Copy first: unmodifiableList alone is only a view, so later
            // add() calls on this builder would alter the built instance.
            return new CopyBitsTo(Collections.unmodifiableList(
                    new ArrayList<>(copyBitsToBuilders)));
        }
    }

    /** @return the unmodifiable list of target field names */
    public List<String> copyBitsToFields() {
        return copyBitsToFields;
    }
}
}